library(tidyverse)
read_files <- function (dir) {
setwd(dir)
files <- list.files(pattern="_output_")
mylist <- lapply(files, function(x) {
load(file = x)
get(ls()[ls()!= "filename"])
})
all <- do.call("bind_rows", mylist)
return(all)
}
#load in files
all <- read_files("~/Desktop/Cluster/MLSims_Revisions/23MayResults")
nrow(all) #to make sure every iteration ran
## [1] 4981
nrow(all %>% select(-iteration) %>% distinct())
## [1] 4981
length(unique(all$seed)) #to make sure there are no duplicates
## [1] 4981
#checking number of iterations
table(all$scenario)
##
## 1a 1b 2
## 2321 2075 585
all <- all %>%
mutate(scenario = factor(scenario, levels=c("1a","1b","2")),
sc_combo = paste(K, ns, cov_shift, study_sd, study_inter_sd, scenario, sep="_"),
sc_combo = factor(sc_combo))
table(all$sc_combo)
##
## 10_half and half_no_1_0.5_1a 10_half and half_no_1_0.5_1b
## 188 192
## 10_half and half_no_NA_NA_2 10_one large_no_1_0.5_1a
## 194 197
## 10_one large_no_1_0.5_1b 10_one large_no_NA_NA_2
## 194 190
## 10_one large_yes_1_0.5_1a 10_one large_yes_1_0.5_1b
## 187 218
## 10_same_no_0.5_0_1a 10_same_no_0.5_0_1b
## 299 201
## 10_same_no_1_0_1a 10_same_no_1_0_1b
## 292 209
## 10_same_no_1_0.5_1a 10_same_no_1_0.5_1b
## 252 204
## 10_same_no_1_1_1a 10_same_no_1_1_1b
## 257 208
## 10_same_no_3_1_1a 10_same_no_3_1_1b
## 215 196
## 10_same_no_NA_NA_2 10_same_yes_1_0.5_1a
## 201 204
## 10_same_yes_1_0.5_1b 30_same_no_1_0.5_1a
## 214 230
## 30_same_no_1_0.5_1b
## 239
#get means and sds per setting
mses <- all %>%
group_by(K, ns, cov_shift, study_sd, study_inter_sd, scenario, honesty, sc_combo) %>%
summarise(across(x_nostudy:ma, list(mean=mean, sd=sd)),
n_iter = n())
#make long data for plotting, just with means
mses_long <- all %>%
group_by(K, ns, cov_shift, study_sd, study_inter_sd, scenario, honesty, sc_combo) %>%
summarise(across(x_nostudy:ma, mean),
n_iter = n()) %>%
pivot_longer(cols=x_nostudy:ma,
names_to="Method",
values_to="MSE") %>%
mutate(MSE = as.numeric(MSE),
base = case_when(grepl("x_",Method)==T ~ "X-Learner",
grepl("s_",Method)==T ~ "S-Learner",
grepl("causal_",Method)==T ~ "Causal Forest",
grepl("ma",Method)==T ~ "Meta-Analysis"),
ensemble = case_when(grepl("nostudy",Method)==T ~ "Complete Pooling",
grepl("studyind",Method)==T ~ "Trial Indicator",
grepl("tree", Method)==T ~ "Ensemble Tree",
grepl("forest",Method)==T ~ "Ensemble Forest",
grepl("lasso",Method)==T ~ "Ensemble Lasso",
grepl("ss",Method)==T ~ "Single-Study",
grepl("ma",Method)==T ~ "Meta-Analysis"),
base = factor(base, levels=c("S-Learner", "X-Learner", "Causal Forest", "Meta-Analysis")),
ensemble = factor(ensemble, levels=c("Single-Study", "Complete Pooling", "Trial Indicator", "Ensemble Tree",
"Ensemble Forest","Ensemble Lasso", "Meta-Analysis")),
sds = paste(study_sd, study_inter_sd, sep=", "), sds = factor(sds),
scenario = ifelse(scenario=="1a", "Piecewise Linear CATE",
ifelse(scenario=="1b", "Non-linear CATE",
"Variable CATE")),
scenario = factor(scenario, levels=c("Piecewise Linear CATE",
"Non-linear CATE",
"Variable CATE")))
First consider the main settings.
#Figure 1
mses_long %>%
filter(cov_shift == "no", K==10, ns=="same") %>%
ggplot(aes(x=sds, y=MSE, group=Method, color=ensemble)) +
geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
#geom_line() +
facet_wrap(~scenario, scales='free') +
scale_x_discrete(labels = c("Low-Low","Med-Low", "Med-Med", "Med-High","High-High")) +
scale_y_continuous(limits = c(0, 2.1)) +
labs(shape="Single-Study Method", color="Aggregation Method") +
guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
#theme(axis.text.x = element_text(angle = 45)) +
xlab("SD of Study Main and Study Interaction Coefficients") +
theme(text = element_text(size=12))
## Warning: Removed 6 rows containing missing values (geom_point).
#ggsave("Plots/MLSims_Fig1_14Feb2023.jpeg",width=14,height=5,units="in")
mses_long %>%
filter(cov_shift == "no", K==10, sds=="1, 0.5") %>%
ggplot(aes(x=ns, y=MSE, group=Method, color=ensemble)) +
geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
#geom_line() +
facet_wrap(~scenario, scales='free') +
scale_y_continuous(limits = c(0, 2.1)) +
labs(shape="Single-Study Method", color="Aggregation Method") +
guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
#theme(axis.text.x = element_text(angle = 45)) +
xlab("Trial Sizes") +
theme(text = element_text(size=12))
mses_long %>%
filter(K==30) %>%
ggplot(aes(x=scenario, y=MSE, group=Method, color=ensemble)) +
geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
#geom_line() +
scale_y_continuous(limits = c(0, 2.1)) +
labs(shape="Single-Study Method", color="Aggregation Method") +
guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
#theme(axis.text.x = element_text(angle = 45)) +
xlab("Scenario") + ggtitle("K=30 Trials") +
theme(text = element_text(size=12))
mses_long %>%
filter(cov_shift == "yes") %>%
ggplot(aes(x=ns, y=MSE, group=Method, color=ensemble)) +
geom_jitter(aes(shape=base), size=2.5, width=.2, height=0) +
#geom_line() +
facet_wrap(~scenario) +
labs(shape="Single-Study Method", color="Aggregation Method") +
guides(shape=guide_legend(order=1), color=guide_legend(order=2)) +
#theme(axis.text.x = element_text(angle = 45)) +
xlab("Trial Sizes") + ggtitle("Covariate Shift") +
theme(text = element_text(size=12))
#Figure 1 boxplot option
all_long <- all %>%
pivot_longer(cols=x_nostudy:ma,
names_to="Method",
values_to="MSE") %>%
mutate(MSE = as.numeric(MSE),
base = case_when(grepl("x_",Method)==T ~ "X-Learner",
grepl("s_",Method)==T ~ "S-Learner",
grepl("causal_",Method)==T ~ "Causal Forest",
grepl("ma",Method)==T ~ "Meta-Analysis"),
ensemble = case_when(grepl("nostudy",Method)==T ~ "Complete Pooling",
grepl("studyind",Method)==T ~ "Trial Indicator",
grepl("tree", Method)==T ~ "Ensemble Tree",
grepl("forest",Method)==T ~ "Ensemble Forest",
grepl("lasso",Method)==T ~ "Ensemble Lasso",
grepl("ss",Method)==T ~ "Single-Study",
grepl("ma",Method)==T ~ "Meta-Analysis"),
base = factor(base, levels=c("S-Learner", "X-Learner", "Causal Forest", "Meta-Analysis")),
ensemble = factor(ensemble, levels=c("Complete Pooling", "Single-Study", "Trial Indicator", "Ensemble Tree",
"Ensemble Forest","Ensemble Lasso", "Meta-Analysis")),
sds = factor(paste(study_sd, study_inter_sd, sep=", ")),
scenario = ifelse(scenario=="1a", "Piecewise Linear CATE",
ifelse(scenario=="1b", "Non-linear CATE",
"Variable CATE")),
scenario = factor(scenario, levels=c("Piecewise Linear CATE",
"Non-linear CATE",
"Variable CATE")))
First let’s look at our original setups.
#removing scenario 2 (variable CATE)
all_long %>%
filter(cov_shift == "no", ns=="same", K==10, scenario != "Variable CATE") %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_grid(sds~scenario, scales='free') +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12))
#removing also high variability and complete pooling
all_long %>%
filter(cov_shift == "no", ns=="same", K==10, scenario != "Variable CATE",
ensemble != "Complete Pooling", sds != "3, 1") %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_grid(sds~scenario) +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12))
all_long %>%
filter(cov_shift == "no", K==10, sds=="1, 0.5") %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_grid(ns~scenario, scales='free') +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12))
#remove complete pooling
all_long %>%
filter(cov_shift == "no", K==10, sds=="1, 0.5", ensemble != "Complete Pooling") %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_grid(ns~scenario, scales='free') +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12))
#switch x-axis
all_long %>%
filter(cov_shift == "no", K==10, sds=="1, 0.5", ensemble != "Complete Pooling") %>%
ggplot(aes(x=base, y=MSE)) +
geom_boxplot(aes(color=ensemble)) +
facet_grid(ns~scenario) +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Aggregation Method") + xlab("Single-Study Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12))
Now consider when K=30.
all_long %>%
filter(K==30) %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_wrap(~scenario) +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
ggtitle("K=30") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12))
Now consider covariate shift.
all_long %>%
filter(cov_shift=="yes") %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_grid(ns~scenario) +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12)) +
ggtitle("Covariate Shift")
#remove complete pooling
all_long %>%
filter(cov_shift=="yes", ensemble != "Complete Pooling") %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_grid(ns~scenario) +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12)) +
ggtitle("Covariate Shift")
Finally, consider scenario 2.
all_long %>%
filter(scenario == "Variable CATE") %>%
ggplot(aes(x=ensemble, y=MSE)) +
geom_boxplot(aes(color=base)) +
facet_wrap(~ns) +
#scale_y_continuous(limits = c(0, 2.1)) +
labs(color="Single-Study Method") + xlab("Aggregation Method") +
theme(axis.text.x = element_text(angle=90),
text = element_text(size=12)) +
ggtitle("Variable CATE")
#Figure 2
mses_long %>%
group_by(Method, ensemble, base) %>%
summarise(MSE=mean(MSE)) %>%
ggplot(aes(x=ensemble, y=MSE, group=1, color=base)) +
geom_point(size=5) +
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=1),
plot.margin=margin(10,10,10,30),
text = element_text(size=15)) +
labs(color="Single-Study Approach") +
xlab("Aggregation Approach")
## `summarise()` has grouped output by 'Method', 'ensemble'. You can override
## using the `.groups` argument.
#ggsave("Plots/MLSims_Fig2_14Feb2023.jpeg",width=8,height=5,units="in")
## anova
mod_params <- lm(MSE ~ factor(base) + factor(ensemble) + factor(study_sd) +
factor(study_inter_sd) + factor(scenario) +
factor(base)*factor(ensemble),
data=filter(means_long, base != "Meta-Analysis", scenario != 2))
summary(mod_params)
anova(mod_params)
anov <- aov(MSE ~ factor(base) + factor(ensemble) + factor(study_sd) +
factor(study_inter_sd) + factor(scenario) +
factor(base)*factor(ensemble),
data=filter(means_long, base != "Meta-Analysis", scenario != 2))
TukeyHSD(anov, 'factor(base)', conf.level=0.95)
TukeyHSD(anov, 'factor(ensemble)', conf.level=0.95)
TukeyHSD(anov, 'factor(study_sd)', conf.level=0.95)
TukeyHSD(anov, 'factor(study_inter_sd)', conf.level=0.95)
TukeyHSD(anov, 'factor(base):factor(ensemble)', conf.level=0.95)
#results table
tab <- sd_mses %>%
mutate(sc_combo = paste(scenario, study_sd, study_inter_sd, sep="_"),
sc_combo = factor(sc_combo)) %>%
select(s_nostudy, x_nostudy, causal_nostudy, s_studyind, x_studyind, causal_studyind,
s_tree, x_tree, causal_tree, s_forest, x_forest, causal_forest,
s_lasso, x_lasso, causal_lasso, ma) %>%
t()
colnames(tab) <- c("Low-Low","Medium-Low","Medium-Medium",
"Medium-High","High-High","Low-Low",
"Medium-Low","Medium-Medium","Medium-High",
"High-High","")
rownames(tab) <- c("S - Pool","X - Pool", "CF - Pool", "S - Indicator",
"X - Indicator", "CF - Indicator", "S - Tree", "X - Tree",
"CF - Tree", "S - Forest", "X - Forest", "CF - Forest",
"S - Lasso", "X - Lasso", "CF - Lasso", "Meta-Analysis")
#print(tab)
rows <- c()
for (j in 1:nrow(tab)) {
res <- rownames(tab)[j]
for (i in 1:ncol(tab)) {
res <- paste(res, tab[j,i], sep=" & ")
}
rows[j] <- res
}
rows
# library(knitr)
# library(kableExtra)
# kable(tab,"html") %>%
# column_spec(1:11, border_right = T) %>%
# column_spec(12, width = "30em") %>%
# add_header_above(c("","Scenario 1a"=5, "Scenario 1b"=5, "Scenario 2"=1)) %>%
# kable_styling()